/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: November 2024

Inputs: 	pdb_full_structures.dta
			pdb_full_papers.dta
			pdb_analysis.dta
			phat.dta
			
Outputs:	Tables 2, E4, E5, E6
			Figures 4, 5, 6, E5, E6

Purpose: 	Test whether potential is associated with:
				- more competition
				- more investment
				- shorter maturation
				- lower quality

*******************************************************************************/

* Assemble data: start from the full structure list and keep only structures
* that also appear in the analysis file. The analysis file may not contain
* structures that are absent from the full list (the merge errors if it does).
clear all
use "${data_built}pdb_full_structures.dta"
merge 1:1 structureId using "${data_built}pdb_analysis.dta", 				///
	assert(master match) keep(match) nogenerate

* Attach paper-level information; papers without a structure are dropped,
* structures without a matching paper are retained
merge m:1 pubmedId using "${data_built}pdb_full_papers.dta", 				///
	keep(master match) nogenerate

* Attach the contents of phat.dta; every structure must match exactly
merge 1:1 structureId using "${data_built}phat.dta", 						///
	assert(match) nogenerate

* Drop structural genomics (SG) structures
drop if structuralGenomics != 0

* Numeric last-author ID, used below as an absorbed fixed effect
encode lastAuthorId, generate(lastAuthorId_num)

/*------------------------------------------------------------------------------

	Table 2: Effect of Potential on Competition, Maturation, Quality

------------------------------------------------------------------------------*/
 
* Results matrix: two columns per outcome (without, then with, PI fixed effects).
*	rows 1-4 : coefficient, SE, p-value, R2 with time controls only
*	rows 5-8 : the same four statistics, adding the complexity controls
*	rows 9-10: outcome mean and N in the estimation sample of the
*			   complexity-controls specification
matrix main_results = J(10,6,.)
local c = 0

* Outcomes: competition, maturation, quality
foreach outcome in priorityRace maturation indexStd {

	* First pass: no PI fixed effects; second pass: absorb last-author FEs
	foreach fe in none pi {
		local ++c
		local r = 0

		* First pass: time controls only; second pass: add complexity controls
		forvalues withComplexity = 0/1 {

			local extra
			if `withComplexity' {
				local extra $complexity_controls
			}

			if "`fe'" == "none" {
				reg `outcome' predPtileCites3 $time_controls `extra', vce(robust)
			}
			else {
				areg `outcome' predPtileCites3 $time_controls `extra', 		///
					absorb(lastAuthorId_num) vce(robust)
			}

			* Coefficient, standard error, two-sided p-value, R-squared
			local ++r
			matrix main_results[`r',`c'] = _b[predPtileCites3]
			local ++r
			matrix main_results[`r',`c'] = _se[predPtileCites3]
			local tstat = _b[predPtileCites3] / _se[predPtileCites3]
			local ++r
			matrix main_results[`r',`c'] = 2*ttail(e(df_r), abs(`tstat'))
			local ++r
			matrix main_results[`r',`c'] = e(r2)
		}

		* Outcome mean and N, restricted to the last estimation sample
		summarize `outcome' if e(sample)
		local ++r
		matrix main_results[`r',`c'] = r(mean)
		local ++r
		matrix main_results[`r',`c'] = r(N)
	}
}

* Write the matrix into the table2 sheet without disturbing the data in memory
preserve
	clear
	svmat main_results
	export excel "${tables}Tables.xlsx", sheet(table2) cell(D60) sheetmodify
restore

/*------------------------------------------------------------------------------

	Table E4: Effect of Potential on Quality (Additional Outcomes)

------------------------------------------------------------------------------*/
	
* Results matrix: one column per additional quality outcome.
*	rows 1-4 : coefficient, SE, p-value, R2 with time controls only
*	rows 5-8 : the same four statistics, adding the complexity controls
*	rows 9-10: outcome mean and N in the estimation sample of the
*			   complexity-controls specification
matrix add_qual_results = J(10,3,.)
local c = 0

* Regressions for the additional quality measures
foreach outcome in resolutionStd rFreeStd ramaRawStd {
	local ++c
	local r = 0

	* First pass: time controls only; second pass: add complexity controls
	forvalues withComplexity = 0/1 {

		local extra
		if `withComplexity' {
			local extra $complexity_controls
		}

		reg `outcome' predPtileCites3 $time_controls `extra', vce(robust)

		* Coefficient, standard error, two-sided p-value, R-squared
		local ++r
		matrix add_qual_results[`r',`c'] = _b[predPtileCites3]
		local ++r
		matrix add_qual_results[`r',`c'] = _se[predPtileCites3]
		local tstat = _b[predPtileCites3] / _se[predPtileCites3]
		local ++r
		matrix add_qual_results[`r',`c'] = 2*ttail(e(df_r), abs(`tstat'))
		local ++r
		matrix add_qual_results[`r',`c'] = e(r2)
	}

	* Outcome mean and N, restricted to the last estimation sample
	summarize `outcome' if e(sample)
	local ++r
	matrix add_qual_results[`r',`c'] = r(mean)
	local ++r
	matrix add_qual_results[`r',`c'] = r(N)
}

* Write the matrix into the tableE4 sheet without disturbing the data in memory
preserve
	clear
	svmat add_qual_results
	export excel "${tables}Tables.xlsx", sheet(tableE4) cell(D60) sheetmodify
restore

/*------------------------------------------------------------------------------

	Table E5: Effect of Potential on Quality (Journal Controls)

------------------------------------------------------------------------------*/

* Create numeric journal ID so that journal can be absorbed as a fixed effect
encode journalName, gen(journalNum)

* Initialize matrix to store results: one column per quality measure.
*	rows 1-4 : coefficient, SE, p-value, R2 with time controls + journal FEs
*	rows 5-8 : the same four statistics, adding the complexity controls
*	rows 9-10: outcome mean and N in the estimation sample of the
*			   complexity-controls specification
matrix journal_FE_regs = J(10,4,.)
local row = 1
local col = 1

* Loop over various quality measures
foreach Q in resolutionStd rFreeStd ramaRawStd indexStd {

	* Journal FEs + time controls
	areg `Q' predPtileCites3 $time_controls, absorb(journalNum) vce(robust)
		matrix journal_FE_regs[`row',`col'] = _b[predPtileCites3]
		local ++row
		matrix journal_FE_regs[`row',`col'] = _se[predPtileCites3]
		local ++row
		* Two-sided p-value from the t distribution with e(df_r) d.o.f.
		local t = _b[predPtileCites3] / _se[predPtileCites3]
		matrix journal_FE_regs[`row', `col'] =  2*ttail(e(df_r), abs(`t'))
		local ++row
		matrix journal_FE_regs[`row',`col'] = e(r2)
		local ++row

	* Journal FEs + time controls + complexity controls
	areg `Q' predPtileCites3 $time_controls $complexity_controls, 			///
	absorb(journalNum) vce(robust)
		matrix journal_FE_regs[`row',`col'] = _b[predPtileCites3]
		local ++row
		matrix journal_FE_regs[`row',`col'] = _se[predPtileCites3]
		local ++row
		local t = _b[predPtileCites3] / _se[predPtileCites3]
		matrix journal_FE_regs[`row', `col'] =  2*ttail(e(df_r), abs(`t'))
		local ++row
		matrix journal_FE_regs[`row',`col'] = e(r2)
		local ++row

		* Restrict the mean and N to the estimation sample, as in the other
		* tables; an unrestricted summarize would also count observations
		* that areg dropped (e.g. missing journal or missing controls)
		sum `Q' if e(sample)
		matrix journal_FE_regs[`row',`col'] = r(mean)
		local ++row
		matrix journal_FE_regs[`row',`col'] = r(N)

		* Move to the top of the next column for the next quality measure
		local row = 1
		local ++col
	}

* Write the matrix into the tableE5 sheet without disturbing the data in memory
preserve
	clear
	svmat journal_FE_regs
	export excel "${tables}Tables.xlsx", sheet(tableE5) cell(D60) sheetmodify
restore


/*------------------------------------------------------------------------------

	Table E6: Effect of Potential on Quality Complexity Control Robustness

------------------------------------------------------------------------------*/

* Discretize each complexity measure into 5 quantile bins
foreach measure in structureMolecularWt residueCount atomSiteCount {
	egen `measure'Bin = xtile(`measure'), nq(5)
}

* Fully interact the three binned measures: each digit of complexityBin holds
* one measure's bin. 5^3 = 125 cells are possible; in practice, some are empty
gen complexityBin = atomSiteCountBin + 10 * residueCountBin + 				///
	100 * structureMolecularWtBin

* Results matrix: two columns per outcome (without, then with, PI fixed effects).
*	rows 1-4 : coefficient, SE, p-value, R2 with time controls only
*	rows 5-8 : the same four statistics, adding complexity-bin fixed effects
*	rows 9-10: outcome mean and N in the estimation sample of the
*			   complexity-bin specification
matrix complexity_robustness = J(10,6,.)
local c = 0

* Outcomes: competition, maturation, quality
foreach outcome in priorityRace maturation indexStd {

	* First pass: no PI fixed effects; second pass: absorb last-author FEs
	foreach fe in none pi {
		local ++c
		local r = 0

		* First pass: time controls only; second pass: add complexity bins
		forvalues withComplexity = 0/1 {

			local extra
			if `withComplexity' {
				local extra i.complexityBin
			}

			if "`fe'" == "none" {
				reg `outcome' predPtileCites3 $time_controls `extra', vce(robust)
			}
			else {
				areg `outcome' predPtileCites3 $time_controls `extra', 		///
					absorb(lastAuthorId_num) vce(robust)
			}

			* Coefficient, standard error, two-sided p-value, R-squared
			local ++r
			matrix complexity_robustness[`r',`c'] = _b[predPtileCites3]
			local ++r
			matrix complexity_robustness[`r',`c'] = _se[predPtileCites3]
			local tstat = _b[predPtileCites3] / _se[predPtileCites3]
			local ++r
			matrix complexity_robustness[`r',`c'] = 2*ttail(e(df_r), abs(`tstat'))
			local ++r
			matrix complexity_robustness[`r',`c'] = e(r2)
		}

		* Outcome mean and N, restricted to the last estimation sample
		summarize `outcome' if e(sample)
		local ++r
		matrix complexity_robustness[`r',`c'] = r(mean)
		local ++r
		matrix complexity_robustness[`r',`c'] = r(N)
	}
}

* Write the matrix into the tableE6 sheet without disturbing the data in memory
preserve
	clear
	svmat complexity_robustness
	export excel "${tables}Tables.xlsx", sheet(tableE6) cell(D60) sheetmodify
restore

/*------------------------------------------------------------------------------

	Figure 4: Effect of Potential on Competition

------------------------------------------------------------------------------*/

* Options shared by the three main binscatters: common x-axis and colors
local potentialAxis xtitle("Potential" "Predicted three-year citation percentile") ///
	xlabel(20(20)80)
local navyStyle mcolor(navy%30) lcolor(navy)

* Competition binscatter
binscatter priorityRace predPtileCites3, controls($time_controls) reportreg ///
	ytitle("Competition" "Priority race indicator") 						///
	ylabel(0.04(0.02)0.12) `potentialAxis' `navyStyle'
graph save "${figures}figure4.gph", replace
graph export "${figures}figure4.pdf", replace

/*------------------------------------------------------------------------------

	Figure 5: Effect of Potential on Maturation

------------------------------------------------------------------------------*/

* Maturation binscatter
binscatter maturation predPtileCites3, controls($time_controls) reportreg 	///
	ytitle("Maturation" "Years between collection and release") 			///
	ylabel(1.5(0.1)1.9) `potentialAxis' `navyStyle'
graph save "${figures}figure5.gph", replace
graph export "${figures}figure5.pdf", replace

/*------------------------------------------------------------------------------

	Figure 6: Effect of Potential on Quality

------------------------------------------------------------------------------*/

* Quality binscatter
binscatter indexStd predPtileCites3, controls($time_controls) reportreg 	///
	ytitle("Quality" "Standardized quality index") 							///
	ylabel(-0.6(0.3)0.6) `potentialAxis' `navyStyle'
graph save "${figures}figure6.gph", replace
graph export "${figures}figure6.pdf", replace

/*------------------------------------------------------------------------------

	Figure E5: Effect of Potential on Investment

------------------------------------------------------------------------------*/

* Investment binscatters: author counts against potential, controlling for
* $time_controls. Each panel is kept in memory under a name so it can be
* combined below; name(, replace) allows this section to be re-run while a
* graph of the same name is still in memory.
binscatter numStructureAuthors predPtileCites3, controls($time_controls) 	///
	title("Panel A: Number of structure authors")							///
	xtitle("Potential" "Predicted three-year citation percentile") 			///
	ytitle("Investment") mcolor(navy%30) lcolor(navy)						///
	xlabel(20(20)80) ylabel(4(0.5)6) name(numStructureAuth, replace)

binscatter numPaperAuthors predPtileCites3, controls($time_controls) 		///
	title("Panel B: Number of paper authors")								///
	xtitle("Potential" "Predicted three-year citation percentile") 			///
	ytitle("Investment") mcolor(navy%30) lcolor(navy)						///
	xlabel(20(20)80) ylabel(6(0.5)8) name(numPaperAuth, replace)

* Place the two panels side by side and save the combined figure
graph combine numStructureAuth numPaperAuth, xsize(6) ysize(2) iscale(*1.8)
graph save "${figures}figureE5.gph", replace
graph export "${figures}figureE5.pdf", replace

/*------------------------------------------------------------------------------

	Figure E6: Effect of Potential on Quality (Additional Outcomes)

------------------------------------------------------------------------------*/

* Additional quality binscatters: one panel per standardized quality measure,
* controlling for $time_controls. Each panel is kept in memory under a name so
* it can be combined below; name(, replace) allows this section to be re-run
* while a graph of the same name is still in memory.
binscatter resolutionStd predPtileCites3, controls($time_controls) 			///
	title("Panel A: Standardized refinement resolution")					///
	xtitle("Potential" "Predicted three-year citation percentile") 			///
	ytitle("Quality") mcolor(navy%30) lcolor(navy)							///
	xlabel(20(20)80) ylabel(-0.6(0.3)0.6) name(resolutionStd, replace)

binscatter rFreeStd predPtileCites3, controls($time_controls) 				///
	title("Panel B: Standardized R-free")									///
	xtitle("Potential" "Predicted three-year citation percentile") 			///
	ytitle("Quality") mcolor(navy%30) lcolor(navy)							///
	xlabel(20(20)80) ylabel(-0.6(0.3)0.6) name(rFreeStd, replace)

binscatter ramaRawStd predPtileCites3, controls($time_controls) 			///
	title("Panel C: Standardized Ramachandran outliers")					///
	xtitle("Potential" "Predicted three-year citation percentile") 			///
	ytitle("Quality") mcolor(navy%30) lcolor(navy)							///
	xlabel(20(20)80) ylabel(-0.6(0.3)0.6) name(ramaRawStd, replace)

binscatter indexStd predPtileCites3, controls($time_controls) 				///
	title("Panel D: Standardized index")									///
	xtitle("Potential" "Predicted three-year citation percentile") 			///
	ytitle("Quality") mcolor(navy%30) lcolor(navy)							///
	xlabel(20(20)80) ylabel(-0.6(0.3)0.6) name(indexStd, replace)

* Combine the four panels and save the combined figure
graph combine resolutionStd rFreeStd ramaRawStd indexStd
graph save "${figures}figureE6.gph", replace
graph export "${figures}figureE6.pdf", replace



